library(tidyverse)
library(DESeq2)
library(factoextra)
library(ggplot2)
library(ggrepel)
library(EnsDb.Hsapiens.v75)
library(clusterProfiler)
library(DOSE)
library(msigdbr)
library(enrichplot)
library(org.Hs.eg.db)
prep_vulcano_plot_data = function(df, pval=0.05, fc=1, bm=25){
df$diffexpressed = "NO"
df$diffexpressed[df$log2FoldChange > 1 & df$padj < pval & df$baseMean > bm] = "UP"
df$diffexpressed[df$log2FoldChange < -1 & df$padj < pval & df$baseMean > bm] = "DOWN"
return(df)
}
vulcano_plot = function(df, title, log2fc = 1, pval=0.05, label_top_n = 20){
p = df %>%
rownames_to_column() %>%
mutate(SYMBOL = ifelse(padj > (df %>% dplyr::select(padj) %>% top_n(-label_top_n) %>% top_n(1) %>% unique() %>% as.numeric()), NA, SYMBOL)) %>%
ggplot(aes(x=log2FoldChange, y=-log10(padj), col=diffexpressed, label=SYMBOL)) +
geom_point(size=2, alpha=0.7) +
scale_color_manual(values=c("#067BC2", "black", "#F37748"),
name="Change",
labels=c("Down", "No Change", "Up")) +
geom_vline(xintercept=c(-log2fc, log2fc), col="red") +
geom_hline(yintercept=-log10(pval), col="red") +
geom_text_repel(show.legend = F, box.padding = 0.5, max.overlaps = Inf) +
theme_bw() +
theme(text = element_text(size = 25),
axis.line = element_line(size = 0.5),
panel.border = element_blank(),
panel.grid = element_blank(),
strip.background = element_blank(),
legend.title = element_blank()) +
ggtitle(paste(title, " (p-val=", pval, ")", sep="")) +
xlab("log2 Fold Change") +
ylab("-log10 Adjusted p-Value")
return(p)
}
run_GSEA = function(gene_list, pval=0.1, cat, subcat=NA, minset=10){
set.seed(321)
#TERM2GENE
term2gene = msigdbr(species = "human", category = cat, subcategory = subcat) %>%
dplyr::select(c("gs_name", "human_ensembl_gene")) %>%
mutate(across('gs_name', str_replace, 'GOBP_', ''))
#run gsea
gse = GSEA(gene_list,
minGSSize = minset,
maxGSSize = 500,
pvalueCutoff = pval,
pAdjustMethod = "BH",
TERM2GENE = term2gene,
verbose = TRUE,
seed = TRUE)
gse=setReadable(gse,
OrgDb = org.Hs.eg.db,
keyType = "ENSEMBL")
return(gse)
}
create_gene_list = function(df, pval=0.05, fc=1, bm=100, direction="BOTH"){
#create gene list with entrez ID as rownmanes and fold change as values
tmp_gene_list = df %>%
filter(!is.na(padj)) %>%
dplyr::select(log2FoldChange, ID, padj, baseMean) %>%
filter(padj <= pval & (log2FoldChange >= fc | log2FoldChange <= fc) & baseMean >= bm) %>%
separate(ID, into=c("ID", "Symbol"), sep=":") %>%
na.omit() %>%
arrange(desc(log2FoldChange)) %>%
dplyr::select(-padj)
if(direction == "UP"){
tmp_gene_list = tmp_gene_list %>%
filter(log2FoldChange > 0)
}
else if(direction == "DOWN"){
tmp_gene_list = tmp_gene_list %>%
filter(log2FoldChange < 0)
}
gene_list = tmp_gene_list$log2FoldChange
names(gene_list) = tmp_gene_list$ID
return(gene_list)
}
PATH_DATA = "/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/RNA/patient_triceps/data/osteosarcoma_counts_triceps_signature_profyle.tsv"
PATH_META_SAMPLE = "/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/files/sample_meta.tsv"
PATH_META_PATIENT = "/lustre06/project/6004736/alvann/from_narval/220318_EZHIP/files/patient_meta.tsv"
meta_pat = data.table::fread(PATH_META_PATIENT) %>% arrange(Patient_N)
meta_pat
meta_samp = data.table::fread(PATH_META_SAMPLE)
set.seed(321)
meta = full_join(meta_samp, meta_pat, by = 'Patient_N') %>%
mutate(Response = tolower(Response),
Primary_location = tolower(Primary_location)) %>%
filter(!is.na(RNA_triceps_signature_profyle)) %>%
filter(!is.na(Patient_N)) %>%
group_by(Patient_N) %>%
sample_n(1)
exp = data.table::fread(PATH_DATA) %>%
column_to_rownames('ensembl_id')
#rename duplicated samples
x = sapply((exp %>% names %>% strsplit(split = '_')),"[[",2)
x[x == "PRO-00316"][1] = "PRO-00316_A"
x[x == "PRO-00316"][1] = "PRO-00316_B"
x[x == "TCMG198"][1] = "TCMG198_A"
x[x == "TCMG198"][1] = "TCMG198_B"
x[x == "TCMG198"][1] = "TCMG198_C"
x[x == "SISJ0252"][1] = "SISJ0252_A"
x[x == "SISJ0252"][1] = "SISJ0252_B"
names(exp) = x
exp = exp %>%
select(meta$RNA_ID_triceps_signature_profyle)
exp
meta %>%
select(EZHIP_Status, Patient_N) %>%
distinct() %>%
dplyr::count(EZHIP_Status) %>%
ggblanket::gg_bar(x = EZHIP_Status, y = n, stat = 'identity')
# Normalize gene expression
meta = meta[match(colnames(exp), meta$RNA_ID_triceps_signature_profyle),]
rownames(meta) = meta$RNA_ID_triceps_signature_profyle
rownames(meta) == colnames(exp)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE
dds = DESeqDataSetFromMatrix(countData=exp,
colData=meta,
design=~1)
dds = DESeq(dds)
EZHIP = counts(dds, normalized=T) %>%
as.data.frame() %>%
dplyr::filter(rownames(.) == 'ENSG00000187690') %>%
t %>%
as.data.frame() %>%
base::merge(colData(dds) %>% as.data.frame() %>% dplyr::select(EZHIP_Status), by = "row.names") %>%
mutate(EZHIP = log2(ENSG00000187690+1))
EZHIP %>% arrange(desc(EZHIP))
EZHIP %>%
ggblanket::gg_boxplot(x = EZHIP_Status, y = EZHIP, y_title = 'EZHIP log2 normalized expression', size = 2) +
geom_point(data = EZHIP, aes(x = EZHIP_Status, y = EZHIP))
EZHIP = EZHIP %>% dplyr::filter(EZHIP_Status == 'positive' | (EZHIP_Status == 'negative' & EZHIP == 0))
meta = meta %>% dplyr::filter(RNA_ID_triceps_signature_profyle %in% EZHIP$Row.names)
exp = exp %>% dplyr::select(meta$RNA_ID_triceps_signature_profyle)
rownames(meta) == colnames(exp)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
meta %>%
dplyr::count(EZHIP_Status, Sex)
exp = exp %>%
rownames_to_column('ID')
geneIDs1 = ensembldb::select(EnsDb.Hsapiens.v75, keys= exp$ID, keytype = "GENEID", columns = c("SYMBOL","GENEID")) %>%
mutate(ENS_SYM = paste0(GENEID,':', SYMBOL))
exp = exp %>% merge(geneIDs1, by.x = 'ID', by.y = 'GENEID') %>%
column_to_rownames('ENS_SYM') %>%
dplyr::select(!c(ID, SYMBOL))
dds = DESeqDataSetFromMatrix(countData=exp,
colData=meta,
design=~EZHIP_Status)
dds = DESeq(dds)
res = results(dds) %>%
as.data.frame()
res %>%
rownames_to_column('ID') %>%
dplyr::filter(grepl('PAGE', ID)) %>%
arrange(padj)
var = counts(dds, normalized=TRUE) %>%
as.data.frame() %>%
apply(1,var) %>%
as.data.frame() %>%
top_frac(n = 0.5, .)
pca = counts(dds, normalized=TRUE) %>%
as.data.frame() %>%
rownames_to_column('gene_id') %>%
dplyr::filter(gene_id %in% rownames(var)) %>%
column_to_rownames('gene_id') %>%
t() %>%
as.data.frame() %>%
prcomp()
fviz_eig(pca)
cowplot::plot_grid(fviz_pca_ind(pca,
axes = c(1, 2),
habillage = colData(dds)$EZHIP_Status, # color by groups
palette = c("#067BC2", "#D56062", "#FFD670"),
addEllipses = TRUE, # Concentration ellipses
ellipse.type = "confidence",
legend.title = "Groups",
repel = TRUE,
geom='point'),
fviz_pca_ind(pca,
axes = c(2, 3),
habillage = colData(dds)$EZHIP_Status, # color by groups
palette = c("#067BC2", "#D56062", "#FFD670"),
addEllipses = TRUE, # Concentration ellipses
ellipse.type = "confidence",
legend.title = "Groups",
repel = TRUE,
geom='point'),
fviz_pca_ind(pca,
axes = c(3, 4),
habillage = colData(dds)$EZHIP_Status, # color by groups
palette = c("#067BC2", "#D56062", "#FFD670"),
addEllipses = TRUE, # Concentration ellipses
ellipse.type = "confidence",
legend.title = "Groups",
repel = TRUE,
geom='point'
),
fviz_pca_ind(pca,
axes = c(4, 5),
habillage = colData(dds)$EZHIP_Status, # color by groups
palette = c("#067BC2", "#D56062", "#FFD670"),
addEllipses = TRUE, # Concentration ellipses
ellipse.type = "confidence",
legend.title = "Groups",
repel = TRUE,
geom='point')
)
results(dds) %>%
as.data.frame() %>%
rownames_to_column('ID') %>%
separate('ID', into = c('ENS', 'SYMBOL'), sep = ':') %>%
prep_vulcano_plot_data(., bm = 0) %>%
filter(!diffexpressed == 'NO')
results(dds) %>%
as.data.frame() %>%
rownames_to_column('ID') %>%
separate('ID', into = c('ENS', 'SYMBOL'), sep = ':') %>%
prep_vulcano_plot_data(., bm = 25) %>%
vulcano_plot(., title = 'c', label_top_n = 50) +
theme(legend.position = 'none',
plot.title = element_blank())
# GSEA
gene_list_tmp = res %>%
filter(!is.na(padj)) %>%
rownames_to_column('ID') %>%
separate(ID, into = c('ENS')) %>%
dplyr::select(ENS, log2FoldChange) %>%
arrange(desc(log2FoldChange))
gene_list = gene_list_tmp$log2FoldChange
names(gene_list) = gene_list_tmp$ENS
GO = run_GSEA(gene_list,
cat="C5",
subcat="BP",
pval = 0.05)
GO %>% as.data.frame()
dotplot(GO, showCategory = 30, x = 'NES') +
scale_colour_gradient2(high = "#E3E9C2", mid = "#E53D00") +
coord_cartesian(xlim = c(-2, 2))